Máster en Data Science UAH

Tasador de viviendas de alquiler vacacional en París

Notebook #3 - Estudio de la localización

Alumno: Héctor Mateos Oblanca
Tutor: Daniel Rodríguez Pérez

Intro

In [1]:
city = 'paris'
month = '201909'
filename_in = 'src/data/' + city + '-' + month + '-listings-CLEAN.csv'
In [2]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import featuretools as ft
import uuid
import s2sphere as s2
import random
 
import catboost as cb
from kmodes.kmodes import KModes
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict 
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import scipy.spatial as spatial
import plotly.express as px
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode

%run src/utils.py
In [3]:
coefs = {}
metrics = {}

def collect_results(columns, model, method, r2, mae, mse, skip_coef=True):
    # coefs
    if skip_coef != True:
        method_coefs = {}
        if hasattr(model, '__intercept'):
            method_coefs['__intercept'] = model.intercept_
        
        for i in range(len(columns.values)):
            method_coefs[columns.values[i]] = abs(model.coef_[i])
        coefs[method] = method_coefs
        df_coefs = pd.DataFrame(coefs)
        df_coefs = df_coefs.sort_values(by=method, ascending=False)
        display(df_coefs)
    
    # metrics
    metrics[method] = {
        'R2':r2.round(3),
        'MAE':mae.round(3),
        'MSE':mse.round(3)
    }
    
    display(pd.DataFrame(metrics))

def print_feature_importances(method, importances, df):
    feature_score = pd.DataFrame(list(zip(df.dtypes.index, importances)), columns=['Feature','Score'])
    feature_score = feature_score.sort_values(by='Score', 
                                              ascending=True, 
                                              inplace=False, 
                                              kind='quicksort', 
                                              na_position='last')
    
    fig = go.Figure(
        go.Bar(
            x=feature_score['Score'],
            y=feature_score['Feature'],
            orientation='h'
        )
    )
    
    fig.update_layout(
        title=method + " Feature Importance Ranking",
        height=25*len(feature_score)
    )
    
    fig.show()

Carga del dataset

In [4]:
df = pd.read_csv(filename_in)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43586 entries, 0 to 43585
Data columns (total 60 columns):
host_response_time                      43586 non-null object
latitude                                43586 non-null float64
longitude                               43586 non-null float64
property_type                           43586 non-null object
room_type                               43586 non-null object
accommodates                            43586 non-null int64
bathrooms                               43586 non-null float64
bedrooms                                43586 non-null float64
price                                   43586 non-null float64
security_deposit                        43586 non-null float64
cleaning_fee                            43586 non-null float64
guests_included                         43586 non-null int64
extra_people                            43586 non-null float64
minimum_nights_avg_ntm                  43586 non-null float64
maximum_nights_avg_ntm                  43586 non-null float64
number_of_reviews                       43586 non-null int64
number_of_reviews_ltm                   43586 non-null int64
first_review                            43586 non-null object
last_review                             43586 non-null object
review_scores_rating                    43449 non-null float64
review_scores_accuracy                  43444 non-null float64
review_scores_cleanliness               43447 non-null float64
review_scores_checkin                   43435 non-null float64
review_scores_communication             43439 non-null float64
review_scores_location                  43433 non-null float64
review_scores_value                     43433 non-null float64
instant_bookable                        43586 non-null int64
cancellation_policy                     43586 non-null object
reviews_per_month                       43586 non-null float64
neighbourhood                           43586 non-null object
has_wifi                                43586 non-null int64
has_essentials                          43586 non-null int64
has_kitchen                             43586 non-null int64
has_heating                             43586 non-null int64
has_washer                              43586 non-null int64
has_hangers                             43586 non-null int64
has_tv                                  43586 non-null int64
has_hair_dryer                          43586 non-null int64
has_iron                                43586 non-null int64
has_shampoo                             43586 non-null int64
has_laptop_friendly_workspace           43586 non-null int64
has_air_conditioning                    43586 non-null int64
has_hot_water                           43586 non-null int64
has_elevator                            43586 non-null int64
has_refrigerator                        43586 non-null int64
has_dishes_and_silverware               43586 non-null int64
has_microwave                           43586 non-null int64
has_bed_linens                          43586 non-null int64
has_no_stairs_or_steps_to_enter         43586 non-null int64
has_coffee_maker                        43586 non-null int64
has_cooking_basics                      43586 non-null int64
has_family/kid_friendly                 43586 non-null int64
has_long_term_stays_allowed             43586 non-null int64
has_first_aid_kit                       43586 non-null int64
has_oven                                43586 non-null int64
has_stove                               43586 non-null int64
has_license                             43586 non-null int64
activity_months                         43586 non-null float64
income_med_occupation                   43586 non-null float64
price_med_occupation_per_accommodate    43586 non-null float64
dtypes: float64(21), int64(32), object(7)
memory usage: 20.0+ MB

Descarte de características

In [5]:
useful_cols = [
    'accommodates',
    'bathrooms',
    'bedrooms',
    'cancellation_policy',
    'cleaning_fee',
    'extra_people',
    'guests_included',
    'has_air_conditioning',
    'has_bed_linens',
    'has_coffee_maker',
    'has_cooking_basics',
    'has_dishes_and_silverware',
    'has_elevator',
    'has_essentials',
    'has_family/kid_friendly',
    'has_first_aid_kit',
    'has_hair_dryer',
    'has_hangers',
    'has_heating',
    'has_hot_water',
    'has_iron',
    'has_kitchen',
    'has_laptop_friendly_workspace',
    'has_license',
    'has_long_term_stays_allowed',
    'has_microwave',
    'has_no_stairs_or_steps_to_enter',
    'has_oven',
    'has_refrigerator',
    'has_shampoo',
    'has_stove',
    'has_tv',
    'has_washer',
    'has_wifi',
    'instant_bookable',
    'latitude',
    'longitude',
    'maximum_nights_avg_ntm',
    'minimum_nights_avg_ntm',
    'neighbourhood',
    'price',
    'property_type',
    'room_type',
    'security_deposit'
]

useless_cols = [
    'district',
    'income_med_occupation',
    'activity_months',
    'host_response_time',
    'first_review',
    'last_review',
    'number_of_reviews',
    'number_of_reviews_ltm',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month'
]

highly_corr_cols = [
    'has_refrigerator', 
    'host_verified_by_selfie'
]

df.drop([*useless_cols, *highly_corr_cols], axis=1, errors='ignore', inplace=True)
df.shape
Out[5]:
(43586, 44)

Nuevas características de localización calculadas

Distancia a puntos de interés

Se calcula para cada propiedad la distancia en kilómetros a diferentes puntos de interés turístico de la ciudad.

In [6]:
pois = [    
    {'name':'eiffel-tower', 'coord':(48.8584, 2.2945)},
    {'name':'louvre', 'coord':(48.8606, 2.3376)},
    {'name':'notre-dame', 'coord':(48.8530, 2.3499)},
    {'name':'arc-de-triomphe', 'coord':(48.8738, 2.2950)},
    {'name':'montmartre', 'coord':(48.8862, 2.3431)}
]
In [7]:
for poi in pois:
    df['dist_' + poi['name']] = df.apply(
        lambda r: get_haversine_distance(
            r['latitude'], 
            r['longitude'], 
            poi['coord']), 
        axis=1)

Clustering de barrios

La característica neighbourhood tiene una cardinalidad muy alta que puede conducir a sobreajuste puesto que en algunos barrios hay pocos datos. Se propone, utilizando clusterización, una característica de cardinalidad intermedia entre barrios y distritos que agrupe barrios similares y que resulte más representativa para el estudio.

In [8]:
km = KModes(n_clusters=15, init='Huang', n_init=10, random_state=42)
df['nb_cluster'] = km.fit_predict(df[['price_med_occupation_per_accommodate', 'neighbourhood']])
clusters = df['nb_cluster'].copy()
df['nb_cluster'] = df['nb_cluster'].apply(lambda x: 'nb_' + str(x))
df.drop(['price_med_occupation_per_accommodate'], axis=1, inplace=True) # solo era para calcular clusters
In [9]:
cluster_map = pd.DataFrame(list(zip(df['neighbourhood'], clusters)), columns=['nb', 'cluster'])
cluster_map.drop_duplicates(inplace=True)

with open('src/geo/' + city + '.neighbourhoods.geojson') as f:
    city_nb = fix_geojson(json.load(f))
    
fig = go.Figure(go.Choroplethmapbox(
    geojson=city_nb,
    locations=cluster_map['nb'], 
    z=cluster_map['cluster'],                   
    colorscale=px.colors.qualitative.Vivid,                                
    marker_opacity=0.5, 
    marker_line_width=0.2
))

fig.update_layout(
    mapbox_style='carto-positron',
    mapbox_zoom=11, 
    mapbox_center={'lat':df['latitude'].mean(), 'lon':df['longitude'].mean()},
    margin={"r":0,"t":0,"l":0,"b":0},
    title='clusters',
    showlegend=False
)

fig.show()

Celdas S2

In [10]:
def get_s2(lat, lng):
    py_cellid = s2.CellId.from_lat_lng(
        s2.LatLng.from_degrees(lat, lng)
    )
    py_cellid = py_cellid.parent(12)
    return 's2_' + str(py_cellid.id())

df['s2'] = df.apply(lambda r: get_s2(r['latitude'], r['longitude']), axis=1)
In [11]:
df_s2 = df[['s2', 'latitude', 'longitude']]
s2_cells = sorted(df_s2['s2'].unique())
random.shuffle(s2_cells)
df_s2['idx'] = df_s2['s2'].apply(lambda x: s2_cells.index(x))
In [12]:
fig314 = go.Figure()

fig314.add_trace(go.Scattermapbox(
    lon=df_s2['longitude'],
    lat=df_s2['latitude'],
    mode='markers',
    marker_color=df_s2['idx'],
    text=df_s2['idx'],
    marker=dict(
        size=5,
        opacity=0.4,
        colorscale='spectral'
    )
))

fig314.update_layout(
    showlegend=False,
    mapbox_style='carto-positron',
    mapbox_zoom=11, 
    mapbox_center={'lat':df['latitude'].mean(), 'lon':df['longitude'].mean()},
    margin={"r":0,"t":0,"l":0,"b":0}
)

fig314.show()

Regiones Voronoi

In [13]:
poi_coords = list(map(lambda x: x['coord'], pois))
vor = spatial.Voronoi(poi_coords)

def get_voronoi_index(row):
    new_point = [row['latitude'], row['longitude']]
    point_index = np.argmin(np.sum((vor.points - new_point)**2, axis=1))
    return 'v_' + str(point_index)

df['voronoi'] = df.apply(lambda r: get_voronoi_index(r), axis=1)
spatial.voronoi_plot_2d(vor)
Out[13]:
In [14]:
df_voronoi = df[['voronoi', 'latitude', 'longitude']]
voronoi_cells = sorted(df_voronoi['voronoi'].unique())
df_voronoi['idx'] = df_voronoi['voronoi'].apply(lambda x: voronoi_cells.index(x))
In [15]:
fig315 = go.Figure()

fig315.add_trace(go.Scattermapbox(
    lon=df_voronoi['longitude'],
    lat=df_voronoi['latitude'],
    mode='markers',
    marker_color=df_voronoi['idx'],
    text=df_voronoi['idx'],
    marker=dict(
        size=5,
        opacity=0.4,
        colorscale='spectral'
    )
))

fig315.add_trace(
    go.Scattermapbox(
        lat=list(map(lambda x: x['coord'][0], pois)),
        lon=list(map(lambda x: x['coord'][1], pois)),
        text=list(map(lambda x: x['name'], pois)),
        mode='markers',
        marker=dict(
            size=8,
            opacity=0.9,
            color='black'
        )
    )
)

fig315.update_layout(
    showlegend=False,
    mapbox_style='carto-positron',
    mapbox_zoom=11, 
    mapbox_center={'lat':df['latitude'].mean(), 'lon':df['longitude'].mean()},
    margin={"r":0,"t":0,"l":0,"b":0}
)

fig315.show()

Conversión de características categóricas en dummies

In [16]:
print(df.shape)
dfd = pd.get_dummies(df)
print(dfd.shape)

target = 'price'
features = list(dfd.columns)
features.remove(target)
(43586, 51)
(43586, 140)

Partición en conjuntos de entrenamiento y test

In [17]:
x_train, x_test, y_train, y_test = train_test_split(
    dfd[features], 
    dfd[target],
    test_size=0.3,
    random_state=42
)

x_train = x_train.astype(float) # prevent conversion warnings

Modelo base: CatBoost

In [18]:
def eval_model(method, cols, df):
    model = cb.CatBoostRegressor(
        verbose=0, 
        random_seed=42, 
        depth=10, 
        iterations=150, 
        learning_rate=0.1
    )
    
    regressor = Pipeline([('model', model)])
    regressor.fit(x_train[cols], y_train)
    y_pred = regressor.predict(x_test[cols])
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    collect_results(cols, model, method, r2, mae, mse, skip_coef=True)
    importances = regressor.named_steps['model'].feature_importances_
    print_feature_importances(method, importances, df[cols])
    return y_pred

Estudio de la localización

In [19]:
neighbourhood_cols = [col for col in dfd if col.startswith('neighbourhood')]
dist_cols = [col for col in dfd if col.startswith('dist_')]
coord_cols = ['latitude', 'longitude']
nb_cluster_cols = [col for col in dfd if col.startswith('nb_cluster_')]
s2_cols = [col for col in dfd if col.startswith('s2_')]
voronoi_cols = [col for col in dfd if col.startswith('voronoi')]

Modelo sin variable geográfica

Este modelo registraría toda la variabilidad de precio que es debida a las propiedades de las viviendas sin considerar caractarísticas geográficas de ningún tipo.

In [20]:
cols = features.copy()
for c in [*neighbourhood_cols, *dist_cols, *coord_cols, *nb_cluster_cols, *s2_cols, *voronoi_cols]:
    if c in cols:
        cols.remove(c)
    
y_pred = eval_model('NO-GEO', cols, dfd)
NO-GEO
MAE 24.681
MSE 1635.394
R2 0.663

Residuos

Se busca si existen zonas con un error positivo o negativo.

  • Lo que se puede asociar con puntos de interés: positivo
  • Zonas que los visitantes prefieren evitar: negativo
In [21]:
x_test['resid'] = y_test - y_pred
plt.hist(x_test['resid'], bins=50)
plt.show()

Residuos outliers

In [22]:
x_test2 = x_test.copy()
x_test2.reset_index(inplace=True)
outliers_idx = get_outliers_iqr(x_test2['resid'])[0]
remove_outliers(x_test2, outliers_idx, 'resid')
outliers between following bounds: -64.16994030169305 58.73672938096226
1020 outliers to be removed with values: [-331.2155293518988, -237.8016404784746, -232.1980900231467, -212.65895296248462, -209.36642351364844, -205.40497432022653, -189.68702737899366, -188.11244155217156, -187.3477148018249, -186.9214815854325, -180.10635305103096, -179.21040542041436, -176.84985866841336, -176.7634662376986, -175.52773539538163, -168.320710094071, -167.23694774284047, -161.21476023805008, -161.11936810280883, -158.77154868626866, -158.63869775527968, -158.04589682265424, -157.16296432998973, -154.77669665463478, -152.32123613356447, -150.8052005756598, -146.02392791459022, -145.14786557059753, -144.46858698021836, -140.39736432516213, -140.16648644159463, -139.76616214873945, -138.6351616170208, -137.03955701864191, -134.96331601575173, -134.16097077335132, -133.1682996460922, -133.05684461589686, -130.47531410266276, -130.4516825798373, -129.39689945369105, -128.18132851388566, -125.63420896505724, -125.13941761391419, -124.91760089091892, -124.23404015265734, -123.86819730930674, -123.54578678227543, -122.04426814289684, -121.41393479980357, -120.84865682164147, -120.65672235871367, -120.51836248585764, -119.94924744645962, -118.67623098760183, -118.19222891913543, -118.1558694667761, -117.574175474187, -117.12935677913768, -116.91903451098679, -115.90195093670873, -115.70546954434366, -111.57211751852094, -110.83783232216963, -110.4282799877711, -109.99242382175893, -109.7664394163613, -109.62737929742968, -108.99833940859352, -108.66659678158553, -108.6130935035126, -108.47629666585715, -108.41166976105137, -108.1844811118043, -107.32092254336845, -107.21275939799307, -106.544663075541, -105.41435237775113, -105.25905761500803, -104.888891444727, -103.52474882085835, -103.2474823418396, -103.11762047056547, -102.5041542203063, -102.35418217183894, -102.1795031762301, -102.03826846097158, -101.97309424355772, -101.48429584202461, -100.80852627588351, -100.7336546114899, -100.55682088142356, -100.52220685793881, -100.19613526345424, -100.19363783274116, -99.87333180194304, -99.86636737349113, -99.36814306351019, -99.2388930621706, -98.64688199170732, -98.08221167245574, -97.80103420518128, -97.54293599461164, -97.16774786735013, -96.92855927446126, -96.75436283628912, -96.73357075717566, -96.23086384072471, -96.00545927550843, -95.62376692883623, -95.47369526347566, -94.99941952835601, -94.93600456694926, -94.84206573090154, -94.34698991247208, -93.25169991886395, -93.1746636054541, -92.95397318036336, -92.47684329550964, -92.45294138861072, -91.85455033454542, -91.62255943490959, -91.27566153400498, -91.27030411024367, -91.11288859120576, -91.03763136721372, -90.75468031249375, -90.47672675292742, -90.23771470814128, -89.50223596181598, -89.24009595407095, -88.97806247782034, -88.51116116533936, -88.44917422273849, -88.06063370605764, -88.05811316029866, -87.82166142663516, -87.66486377930985, -87.57795092670054, -87.22254827366837, -87.01129172581332, -86.54055055999632, -86.49032463449402, -86.36591448823935, -86.26289188482389, -85.78141204770102, -85.31823921085817, -85.15865262878262, -84.94117121057371, -84.67795740492025, -84.53522372843887, -84.48061862433536, -83.82781676709885, -83.47274182972117, -83.28414870530318, -83.01098675878254, -82.86287445942385, -82.86143235118774, -82.67775451320844, -82.62667632324039, -82.32794828759373, -82.31753445396845, -82.27725522382536, -82.25696616808435, -81.94917967112124, -81.91358231100776, -81.8191630867637, -81.67237274808005, -81.57137266508134, -81.40298751452207, -81.35900052489336, -81.2773444894961, -81.14565029763332, -80.91958768474419, -80.7847112003555, -80.76217264134061, -80.67946987388612, -80.21175087646333, -79.6940860833746, -79.61001556842695, -79.36526397165864, -79.36155271370455, -79.2389907645926, -79.10502664534653, -79.07003289443699, -78.72091027073805, -78.12062514787493, -77.66668953996422, -77.5842748253504, -77.30724572542928, -77.25773483816356, -77.19237100814595, -76.77427949791289, -76.77364392188, -76.68386853106125, -76.04720691036906, -75.79124025706147, -75.75690007408713, -75.43868519258126, -74.56019935881264, -74.49209965518946, -74.41843946912294, -74.40987039205176, -74.10228200098535, -73.86939713156548, -73.775214052401, -73.64813123737078, -73.63348236584417, -73.46686099545637, -73.29302374805627, -73.26780380447434, -72.99902723387197, -72.86411845280861, -72.84630294185592, -72.70737896723261, -72.70190473949472, -72.54075558482299, -72.4167595616808, -71.670019529321, -71.581252127001, -71.56431445225621, -71.51373908757708, -71.28403873499127, -71.16061596457007, -71.10453395250417, -71.07515944770523, -70.83448019997084, -70.82776335139104, -70.75389584408401, -70.47540419971045, -70.3727845000733, -70.15878883847355, -70.01872005660286, -70.00851367147627, -69.82823689623483, -69.69625501814414, -69.67303895078476, -69.55737409191175, -69.42828224424144, -69.38417624723988, -69.30658385029216, -69.25692235192889, -69.09346102223682, -69.04709791802097, -69.02490584598351, -68.90981454633052, -68.82918291530672, -68.81981711232615, -68.78075923827512, -68.61748871159523, -68.59829406525833, -68.5735542952666, -68.46220020401378, -68.40904067245958, -68.28094361740216, -68.16574708239983, -68.07350891927885, -68.04810197651744, -68.003976878841, -67.94376709997249, -67.79979507204897, -67.76618766109229, -67.69496570105497, -67.57737780755087, -67.56732422498888, -67.56485170949098, -67.42742565752945, -67.38625957950737, -67.14898996842379, -67.00720457765036, -66.827356135309, -66.77739936392013, -66.6630739308834, -66.64048039460897, -65.93239322069886, -65.89646223589759, -65.80091933948086, -65.70892593396061, -65.65762048408773, -65.62395315892799, -65.6171824572854, -65.55411227716112, -65.47575818936548, -65.14567654813897, -64.96532653857378, -64.95530199257425, -64.88306719234515, -64.73242343027141, -64.62717090894448, -64.45573425015945, -64.39836876106625, -64.39218115554466, -64.36223998695345, -64.17775855985394, 58.768216656687216, 58.83234050944429, 58.90143325996374, 59.004851187243474, 59.044102057896794, 59.05051875655988, 59.09598564831758, 59.24759948069918, 59.27369088938299, 59.3554982875074, 59.392210532726835, 59.49187719640854, 59.63004277767075, 59.64332526430886, 59.75284142268022, 59.76300861632174, 59.80133953074544, 59.91760958949621, 59.98049347757305, 60.00283859800558, 60.07414680006386, 60.097884979108244, 60.10984441784299, 60.171253942911775, 60.23926047648398, 60.33258591489448, 60.394494528758, 60.439638181179674, 60.45529072856357, 60.4617460519907, 60.51422827909866, 60.55701688642762, 60.55725816983072, 60.64881895712699, 60.75614686660718, 60.82239764562641, 60.893402745984105, 61.098749083921575, 61.13134601991041, 61.146752645384645, 61.195574658994474, 61.20186532578107, 61.33416668543012, 61.53382238581892, 61.59495517312406, 61.59879768985685, 61.665308618047234, 61.70136273755209, 61.760385577020855, 61.775025284436566, 61.822449856514154, 61.838576146852034, 61.89403676862139, 61.90955248588696, 62.1932311884149, 62.19361192993149, 62.211505457740344, 62.292344221682924, 62.333003766489924, 62.387257205058646, 62.40203893942352, 62.471197526359845, 62.50381126843469, 62.759846783760054, 62.94544763174872, 63.00387668383797, 63.05007448561888, 63.09616081925532, 63.10458713213458, 63.18303961991339, 63.21067612166685, 63.32311261513449, 63.47343678806476, 63.54023400191036, 63.79283948250196, 63.92577778127543, 63.94027391403144, 63.943866838132436, 63.94651194013517, 63.94660308690656, 63.971512459066304, 64.03406278525276, 64.14132815031519, 64.17552689664006, 64.23289080648478, 64.31415001821925, 64.45212653758045, 64.50618576458697, 64.56900013444209, 64.58369923189693, 64.5975755937479, 64.6010831281737, 64.66711135473793, 64.71555441717527, 64.94492826279546, 64.94828847688822, 64.99629962122644, 65.01728890647652, 65.03222698958388, 65.05148327002296, 65.07116514401834, 65.1231253603063, 65.13655235340138, 65.52600848707166, 65.60903009790172, 65.69361283428162, 65.8290102302729, 65.8325436409539, 65.88223555550837, 65.94609065556385, 66.07524550841204, 66.14340375906633, 66.25679973201943, 66.25853253037624, 66.26912035773582, 66.29911611231529, 66.38022696062255, 66.39997132235283, 66.45569347376562, 66.6105055244939, 66.63267923538872, 66.78570120611634, 66.90625752027195, 66.97599099080327, 67.03768542927116, 67.04236041397087, 67.06123231919608, 67.1175620997692, 67.12634673004848, 67.14524057619829, 67.39031398174433, 67.43490428220782, 67.50585495719794, 67.53337696884094, 67.66068999251553, 67.66713737913867, 67.68156334414165, 67.71286091598444, 67.81168331245307, 67.87103295574497, 67.90361674849798, 67.95946466040138, 67.97038305396637, 67.98439555194379, 68.14974073733114, 68.19565222893154, 68.26004006943936, 68.31806209117161, 68.32974738069471, 68.36518991206691, 68.37540072984753, 68.49905000191146, 68.56119215202864, 68.67636610265563, 68.73629515330532, 68.74723534864015, 68.86644785189615, 68.87750363417337, 68.93425634498524, 68.95687945084607, 68.97677618070338, 69.2932072946775, 69.43263681297009, 69.45284177423225, 69.59925668776889, 69.60661288469286, 69.75228120420755, 69.79907127340846, 70.00056382180772, 70.00066305756519, 70.13767806005836, 70.15473412505663, 70.47895463094402, 70.55050609859362, 70.68893787488614, 70.94458952569234, 70.94497144592921, 70.97210599231542, 71.05417201958913, 71.1115551956122, 71.11737794100993, 71.20572521001014, 71.42175719408515, 71.42759872386114, 71.50080879581746, 71.63291968438244, 71.80875831891757, 71.93398440688125, 71.97741806212346, 71.99211341369693, 72.03261342708716, 72.13689542961654, 72.21002053750249, 72.2708750407316, 72.37215159666388, 72.43100894942944, 72.43905346157905, 72.4623823319289, 72.48610049973038, 72.62235156915351, 72.66573152529999, 72.89409944541326, 72.94347121765874, 73.0701642502055, 73.24396712635793, 73.36509993428245, 73.43353467418635, 73.44965865602884, 73.59809726163638, 73.60448195339862, 73.73365202156172, 73.91073928902398, 73.99392723493219, 74.00093655015753, 74.05048416182561, 74.10096657249827, 74.20186904300186, 74.20529490093107, 74.36947855498335, 74.45375769342063, 74.64738957484117, 74.6797972461017, 74.72114343660834, 74.8075080778278, 74.8091753742828, 74.89717373680827, 74.99454823738664, 75.03047810365402, 75.16112056783486, 75.16394081790057, 75.24462365112268, 75.39921937383973, 75.41183524482841, 75.54334570659834, 75.64338536149582, 75.71961273366912, 75.77119119959872, 76.08076528971725, 76.49091256087883, 76.50491410348111, 76.6214456210107, 76.65906331267254, 77.26505570715871, 77.28148652220136, 77.31169773655681, 77.35173083411408, 77.38186026324114, 77.38917101470575, 77.45071201031477, 77.46349396669858, 77.48379254206228, 77.51083788436992, 77.63396361863464, 77.72953144819597, 77.82585738204332, 77.98987961725311, 77.99047909170045, 78.14432613038161, 78.16276426454786, 78.18700909395642, 78.22148816407031, 78.4324205463679, 78.45844102445241, 78.46346879045271, 78.53288551437024, 78.79377963325985, 78.79616518460759, 78.85662876919567, 78.8807505112012, 78.89380205664301, 78.91555075528555, 78.94089386380068, 79.14596394255827, 79.16574442690238, 79.37596136865267, 79.47950769564518, 79.52358289714363, 79.63265158279697, 79.79858473597136, 79.83100939353842, 80.10325319014059, 80.21668612696914, 80.22142898133032, 80.2947263298027, 80.30246283723588, 80.43866750605382, 80.44649256476444, 80.79343535271322, 80.93922391661482, 81.08758525675144, 81.12262980941269, 81.12796655611295, 81.15782647560411, 81.21067612166685, 81.32427842129516, 81.49875824537287, 81.58939890604421, 81.75246370415442, 81.79149479215894, 81.80483052529134, 81.83365155723547, 81.86808150298504, 81.92998032569645, 82.09466642295753, 82.09857867747817, 82.35449884328693, 82.44775089568799, 82.48581888015005, 82.92466148887539, 83.17569077650498, 83.51944640320309, 83.66030023544084, 83.66545671111476, 83.68066869288938, 83.68341137991973, 84.25671409918976, 84.29485764245223, 84.31430202062177, 84.39311576538735, 84.5203832410171, 84.57897875851172, 84.67281078029288, 84.79721138366939, 84.93687406712806, 84.95483392493729, 84.99197532562229, 85.2656472729324, 85.34622881713463, 85.38830484872884, 85.54134561768696, 85.82668196032785, 85.91949572005325, 85.94692886594493, 85.96421579725441, 86.28327498927763, 86.36115018093696, 86.41940238008596, 86.45775606145676, 86.51508042005362, 86.52543977359784, 86.81335474007788, 86.9672846745687, 87.02014259449487, 87.03970013326051, 87.05890846162958, 87.05992670789564, 87.36693982982376, 87.48091381531762, 87.65574979969108, 87.75133979751529, 87.95244515554629, 87.98352045358843, 88.12658931068637, 88.18410303453481, 88.29136549058222, 88.3936413628409, 88.51014188272678, 88.60589054037887, 88.81918558002282, 88.97979492964414, 88.98074554510318, 89.03947983844085, 89.26266319977688, 89.29590104242207, 89.33318986252424, 89.3549408835892, 89.59894051170264, 89.75424168333947, 89.77510752938053, 89.83339510074664, 89.99550989011688, 90.00363924180508, 90.22279713003533, 90.58419109610368, 90.59084323473792, 91.39972681846174, 91.56416592723673, 91.59342865387237, 91.59628393500776, 91.6730287365593, 91.9261046894523, 92.02051443047887, 92.02815911852622, 92.1091098314175, 92.1346195228689, 92.14880452235282, 92.27596422262553, 92.32708305037205, 92.34384485999158, 92.43495108926149, 92.59652791511138, 92.81402661461495, 92.82797763531354, 92.88811180459942, 93.01613578407486, 93.15600159016311, 93.28905990716811, 93.32780884816609, 93.5277905765185, 93.68900654553013, 93.80267073106752, 93.84471726573804, 94.19055637086295, 94.1909232057128, 94.19918214558307, 94.42221791992813, 94.93717391994747, 94.9811143683436, 95.03819711143069, 95.26616792996167, 95.31422778243866, 95.4556089228785, 95.51917876514625, 95.79753431532525, 96.26555290615939, 96.49800755115163, 96.61790692359155, 96.75728039969289, 96.91124356993618, 96.99870156366143, 97.3027093190548, 97.91480196551217, 97.97893212984462, 98.12288341137968, 98.17401704361978, 98.52987285478747, 98.55865409179216, 98.60286104959744, 98.63221853075846, 98.65176463941233, 98.66433345967644, 98.74237423659281, 98.95550587256616, 99.17839974948585, 99.59965408110406, 99.79241620517504, 99.82871139157929, 99.88792095722357, 99.98819070932845, 100.2474400791564, 100.27212195927359, 100.28832645090253, 100.39038236547606, 100.52993196581954, 100.59776870336333, 100.76473050428741, 101.2354985939639, 101.24417596610486, 101.7426006482072, 101.92056956835401, 102.05640032234407, 102.62560616916278, 102.79455639502808, 103.01849606043602, 103.43958078297088, 103.6123258229949, 103.64879315489114, 103.89666339051516, 103.92343539848036, 103.95619137413892, 104.01946194018494, 104.21727795683711, 104.33900866056553, 104.35402886860118, 104.5710759108129, 104.76276135429814, 104.76989675583283, 104.88654034751289, 104.93943622094106, 105.24644438330111, 105.55621412274428, 105.56600944818422, 105.78136036247399, 106.24588676412029, 107.00413685048682, 107.14406467953842, 107.64391267933743, 107.76289010379747, 108.03010741153555, 108.04263658737365, 108.22566703220332, 108.34114937346729, 108.44899752855942, 108.81780570453931, 108.8397085766077, 108.87005170452461, 109.0184253679231, 109.11806976102184, 109.22675944797322, 109.34976272226928, 109.37829515727398, 109.43275475251313, 109.74536332066762, 109.76422817841427, 109.82824053454847, 110.56581910099581, 110.58094960650118, 110.917993612673, 110.99529184280698, 111.13271869856011, 111.2781900576546, 111.9423996417675, 112.01240165120782, 112.1548227744857, 112.63069475016312, 112.99715913392444, 113.55405521535977, 113.64695092630043, 114.0029917671506, 114.0184865089331, 114.04940132821326, 114.06504230184004, 114.12157951537432, 114.9707129442604, 115.02919107041659, 115.25046917055175, 115.8948546900725, 116.05317488943696, 116.30769211223782, 116.36414631680276, 116.78762546823218, 117.46245108365058, 117.46930572906811, 117.76900658297009, 118.65756743094366, 118.87449789299228, 119.19496654648032, 120.34840548451888, 120.42662180632722, 120.85646603224795, 121.04892272199118, 121.05950288635051, 121.0813236568838, 121.22344307266181, 121.43113857709365, 121.47139288293978, 122.04666035498605, 122.96465749340905, 122.98358815709814, 123.15946868605226, 123.47436041885368, 123.73963165980001, 123.8802130234247, 124.0364198374395, 124.25492571324523, 126.477312664316, 126.53770920286007, 126.6144263449144, 126.93457245059335, 127.25471671065617, 127.72389895654216, 127.73789076049873, 128.09312476476842, 128.12077755622244, 128.55294606868415, 128.5722620286042, 129.0027767260633, 129.00852203804897, 129.05307286710132, 129.62275191924886, 129.62823167683408, 130.26462002271438, 130.42382243075775, 130.4842054175515, 130.57398822369842, 130.90154049907258, 130.9104336232146, 130.9207316505879, 130.94579492089588, 131.9592315860668, 132.16647086439195, 133.4765575128814, 133.7089844160232, 134.05540134392277, 134.09877790095075, 134.12481573471504, 134.22524360739553, 134.77419705849334, 135.18999495690773, 135.23709739759695, 135.44090967584378, 135.51133612999757, 135.92562975845328, 136.1103450849113, 136.49703454206028, 136.52951804680913, 137.07452634972165, 137.1803174871784, 137.65659458937603, 138.64148868962616, 139.5230725867571, 139.59578921680216, 140.26481383576368, 140.2784471281306, 140.63021221383548, 140.98303057426534, 141.02726461545518, 141.14288086083405, 141.70478283831653, 142.4636961981131, 143.00324350664818, 143.11344951597445, 143.8880560866479, 144.33231595432522, 145.54712422711899, 146.41248417373984, 146.60449159760776, 146.68761262529074, 146.77347072816394, 147.22267994010565, 147.4010687282896, 147.74194444385606, 148.14685169168837, 148.40893246331984, 149.06136701405984, 149.23323585681493, 149.50028876406327, 150.58068914008715, 150.95657383962435, 151.7744275850382, 151.83512960498632, 152.5123036591291, 152.92307417411308, 153.28945593422287, 153.50346769965404, 154.26202272248815, 154.99297396249912, 159.19651790548926, 159.3487870218102, 159.4160735999285, 159.91764952148029, 160.1368191431094, 160.51522229353844, 161.89277340518726, 162.03937002865683, 163.04563766925074, 163.53261217906905, 163.987316073163, 164.04736897924397, 164.766609655442, 166.1783195719512, 167.6676316624256, 168.25614109626332, 169.78218501064475, 173.83043657964396, 174.4063358288347, 175.93569195561372, 176.45686095230937, 176.52264717972065, 177.37644917550028, 181.5919058039139, 184.05508630913408, 184.16635143197635, 184.39925487872682, 184.76527825109392, 185.01014816072785, 186.64947940141624, 191.41845232026486, 191.5604882767502, 193.72189290642729, 194.3324617443269, 194.40656379407213, 196.3918076609698, 199.8802374468156, 200.14335968643542, 200.66891041117145, 202.31369292503624, 204.20896584422462, 204.70123605048622, 205.900582345376, 208.40792628621998, 209.58412414020782, 216.03883932689826, 217.11729683203424, 217.1310935380757, 219.94688540988722, 220.53770052445768, 223.5009221200258, 224.09043271252835, 225.68085676615254, 226.03529530440812, 227.30886395727106, 227.97221198343556, 233.71447577832953, 233.87893373821478, 234.7567430127036, 238.7240533018327, 239.0696784156869, 242.8902701877928, 247.46482787986065, 249.2018085050034, 249.5621924163001, 251.61688246251296, 251.6178960597608, 252.98168082273097, 255.09951763021803, 258.5493791267306, 259.40301349281515, 261.49214029199095, 267.0898516465238, 269.8344412219929, 270.78462482564476, 275.82994869393985, 291.5936236844113, 293.22594550365034, 305.7057802694586, 308.1315998623823, 309.39100836211003, 319.8840487988267, 324.14656123469547, 325.8593321015818, 338.56969424281954, 355.9184538944447, 356.42490718226725, 356.5240913813621, 360.6326546345699, 362.9240533018326, 363.5765538223741, 400.0808849725738, 432.48735247719105, 575.5613930281978, 606.5689031371605, 782.5242257035219]
In [23]:
plt.hist(x_test2['resid'], bins=30)
plt.show()
In [24]:
fig1 = go.Figure(
    go.Scattermapbox(
        lon=x_test2['longitude'],
        lat=x_test2['latitude'],
        mode='markers',
        marker_color=x_test2['resid'],
        text=x_test2['resid'],
        marker=dict(
            opacity=0.8,
            colorscale=[
                [0.0, "rgb(165,0,38)"],
                [0.11, "rgb(215,48,39)"],
                [0.22, "rgb(244,109,67)"],
                [0.33, "rgb(253,174,97)"],
                [0.44, "rgb(254,224,144)"],
                [0.55, "rgb(224,243,248)"],
                [0.66, "rgb(171,217,233)"],
                [0.77, "rgb(116,173,209)"],
                [0.88, "rgb(69,117,180)"],
                [1.0, "rgb(49,54,149)"]
            ]
        )
    )
)

fig1.update_layout(
    mapbox_style='carto-positron',
    mapbox_zoom=11, 
    mapbox_center={'lat':x_test2['latitude'].mean(), 'lon':x_test2['longitude'].mean()},
    margin={"r":0,"t":0,"l":0,"b":0}
)

fig1.show()

Coordenadas

In [25]:
cols = features.copy()
for c in [*neighbourhood_cols, *dist_cols, *nb_cluster_cols, *s2_cols, *voronoi_cols]:
    if c in cols:
        cols.remove(c)
    
y_pred = eval_model('COORD', cols, dfd)
NO-GEO COORD
R2 0.663 0.717
MAE 24.681 22.243
MSE 1635.394 1376.964

Barrios

In [26]:
cols = features.copy()
for c in [*dist_cols, *coord_cols, *nb_cluster_cols, *s2_cols, *voronoi_cols]:
    if c in cols:
        cols.remove(c)
    
y_pred = eval_model('NB', cols, dfd)
NO-GEO COORD NB
R2 0.663 0.717 0.699
MAE 24.681 22.243 22.936
MSE 1635.394 1376.964 1460.683

Cluster de barrios

In [27]:
cols = features.copy()
for c in [*neighbourhood_cols, *dist_cols, *coord_cols, *s2_cols, *voronoi_cols]:
    if c in cols:
        cols.remove(c)
    
y_pred = eval_model('CLUSTER-NB', cols, dfd)
NO-GEO COORD NB CLUSTER-NB
R2 0.663 0.717 0.699 0.680
MAE 24.681 22.243 22.936 23.814
MSE 1635.394 1376.964 1460.683 1555.313

Distancias a puntos de interés

In [28]:
cols = features.copy()
for c in [*neighbourhood_cols, *coord_cols, *nb_cluster_cols, *s2_cols, *voronoi_cols]:
    if c in cols:
        cols.remove(c)
    
y_pred = eval_model('DIST', cols, dfd)
NO-GEO COORD NB CLUSTER-NB DIST
R2 0.663 0.717 0.699 0.680 0.714
MAE 24.681 22.243 22.936 23.814 22.173
MSE 1635.394 1376.964 1460.683 1555.313 1388.082

Voronoi

In [29]:
cols = features.copy()
for c in [*neighbourhood_cols, *nb_cluster_cols, *coord_cols, *dist_cols, *s2_cols]:
    if c in cols:
        cols.remove(c)
    
y_pred = eval_model('VORONOI', cols, dfd)
NO-GEO COORD NB CLUSTER-NB DIST VORONOI
R2 0.663 0.717 0.699 0.680 0.714 0.671
MAE 24.681 22.243 22.936 23.814 22.173 24.205
MSE 1635.394 1376.964 1460.683 1555.313 1388.082 1598.586

S2

In [30]:
cols = features.copy()
for c in [*neighbourhood_cols, *nb_cluster_cols, *coord_cols, *dist_cols, *voronoi_cols]:
    if c in cols:
        cols.remove(c)
    
y_pred = eval_model('S2', cols, dfd)
NO-GEO COORD NB CLUSTER-NB DIST VORONOI S2
R2 0.663 0.717 0.699 0.680 0.714 0.671 0.694
MAE 24.681 22.243 22.936 23.814 22.173 24.205 23.146
MSE 1635.394 1376.964 1460.683 1555.313 1388.082 1598.586 1484.095

Automated feature engineering

In [31]:
auto_df = df.copy()
auto_df['auto_id'] = auto_df['price'].apply(lambda x: uuid.uuid1().int)
prices = auto_df['price']
auto_df.drop(['price'], axis=1, inplace=True, errors='ignore')
In [32]:
es = ft.EntitySet(id='airbnb')
es = es.entity_from_dataframe(
    entity_id='main',
    dataframe=auto_df,
    index='auto_id'
)
In [33]:
# available_transform_primitives = ft.primitives.list_primitives()
# print(available_transform_primitives[available_transform_primitives['type'] == 'transform'])

features_df, feature_names = ft.dfs(
    entityset=es,
    target_entity='main',
    trans_primitives=['subtract_numeric'],
    max_depth=2
)

# print(features_df.columns)
In [34]:
auto_df = features_df.copy()
auto_df.reset_index()
auto_df.drop(['auto_id'], axis=1, inplace=True, errors='ignore')

auto_df = pd.get_dummies(auto_df)
print(auto_df.shape)

auto_features = list(auto_df.columns)

x_train, x_test, y_train, y_test = train_test_split(
    auto_df, 
    prices,
    random_state=42
)

x_train = x_train.astype(float) # prevent conversion warnings
(43586, 1042)
In [35]:
y_pred = eval_model('AUTO-FT', auto_features, auto_df)
NO-GEO COORD NB CLUSTER-NB DIST VORONOI S2 AUTO-FT
R2 0.663 0.717 0.699 0.680 0.714 0.671 0.694 0.719
MAE 24.681 22.243 22.936 23.814 22.173 24.205 23.146 22.176
MSE 1635.394 1376.964 1460.683 1555.313 1388.082 1598.586 1484.095 1370.771